# Alex Gazmararian
# agazmararian@gmail.com

library(tidyverse)
library(tidylog)
library(here)
library(readxl)
library(sf)
library(fixest)

source(here("R", "visibility", "replication_mode.R"))
source(here("analysis", "visibility", "processing", "survey_weights.R"))

# Configuration----
data_year <- 2023 # Year for merging; lagged by 1 year

# Load survey data - use geo file if available, otherwise use non-geo version
# In anonymized mode, the geo file is removed to protect respondent privacy
# Geo file is in data/inter/ (regenerated in full mode from cache + geocoding)
# Non-geo file is in data/cache/ (base processed survey data)
geo_file <- here("data", "inter", "survey_visibility_processed_with_geo.rds")
nongeo_file <- here("data", "cache", "survey_visibility_processed.rds")

if (file.exists(geo_file)) {
  message("Loading survey data WITH coordinates (full mode)")
  g <- readRDS(geo_file)
  HAS_COORDINATES <- TRUE
} else if (file.exists(nongeo_file)) {
  message("Loading survey data WITHOUT coordinates (anonymized mode)")
  message("DMA assignment will be skipped; using cached distance calculations")
  g <- readRDS(nongeo_file)
  HAS_COORDINATES <- FALSE
  
  # In anonymized mode, load cached FIPS assignments for joining with covariates
  # These are county-level identifiers needed for BEA, broadband, elections data
  # Also includes loc_anom for geo_subset robustness analysis
  if (fips_cache_exists()) {
    message("Loading cached FIPS assignments for covariate merging")
    fips_cache <- read_csv(get_fips_cache_path(), show_col_types = FALSE)
    g <- left_join(g, fips_cache, by = "response_id")
    message("  Added columns: fips, fips.pre, fips.bea, county, state, loc_anom")
  } else {
    stop("Anonymized mode requires FIPS cache for covariate joining. File not found: ", 
         get_fips_cache_path())
  }
} else {
  stop("No survey data file found. Run process_survey.R first.")
}

# Load and prepare data----
message("=== LOAD AND PREPARE DATA ===")
projdist <- readRDS(get_distance_cache_path())

broadband <- read_csv(here("data", "inter", "broadband_county_processed.csv"), show_col_types = FALSE)
broadband$bb100 <- relevel(factor(broadband$bb100), ref = "0-400")
broadband <- broadband %>% 
  filter(year == data_year) %>% 
  select(-year) %>%
  select(-c(year_lag, fips))

bea <- read_csv(here("data", "inter", "bea_county_processed.csv"), show_col_types = FALSE)
bea <- bea %>% 
  filter(year == data_year) %>% 
  select(-year) %>%
  select(fips.bea:income_pc) %>%
  distinct()

pres <- read_csv(here("data", "inter", "pres_elections_2020.csv"), show_col_types = FALSE)

eprice <- read_csv(here("data", "inter", "industrial_electricity_prices_state.csv"), show_col_types = FALSE)
eprice <- eprice %>%
  filter(year == data_year) %>%
  filter(!is.na(state.abb) | state == "District of Columbia") %>%
  select(-c(year_lag, state.abb))

interstate <- read_csv(here("data", "inter", "highway_counties_processed.csv"), show_col_types = FALSE)

emp <- read_csv(here("data", "inter", "emp_county_processed.csv"), show_col_types = FALSE)
emp <- emp %>%
  filter(year == data_year) %>%
  select(-c(year, year_lag, fips))

union <- read_csv(here("data", "inter", "union_state_processed.csv"), show_col_types = FALSE)
union <- union %>% 
  filter(year == data_year) %>% 
  select(-c(year, year_lag, state.abb))

acs <- read_csv(here("data", "inter", "acs2023_county_processed.csv"), show_col_types = FALSE)
popd <- read_csv(get_popdensity_cache_path(), show_col_types = FALSE)

gov <- read_xlsx(here("data", "input", "govparty", "governors_2024.xlsx"), progress = FALSE)
gov <- subset(gov, select = c(state_full, party))
gov <- rename(gov, govparty = party)

# Code DC as having Democratic governor
gov <- rbind(gov, data.frame(state_full = "District of Columbia", govparty = "D"))

# Merge data----
message("=== MERGE DATA ===")

m1 <- left_join(g, projdist, by = "response_id")

m2 <- left_join(m1, broadband, by = c("fips" = "fips.post"))

m3 <- left_join(m2, bea, by = "fips.bea")

m4 <- left_join(m3, pres, by = c("fips.pre" = "fips"))

m5 <- left_join(m4, eprice, by = "state")

m6 <- left_join(m5, interstate, "fips.pre")

m7 <- left_join(m6, emp, by = c("fips" = "fips.post"))

m8 <- left_join(m7, union, by = "state")

acs$fips[acs$fips == 9001] <- 9120
m10 <- left_join(m8, acs, by = "fips")

m11 <- left_join(m10, popd, by = "response_id")

# Add DMA spatial join ----
message("=== SPATIAL JOIN WITH DMA DATA ===")

# Load DMA data (from consolidated dma.R script - problematic DMAs already filtered)
dma <- readRDS(here("data", "inter", "dma_processed.rds"))
message("Loaded DMA data with ", nrow(dma), " markets")

# Check DMA coverage
dma_bbox <- st_bbox(dma)
message("DMA coverage: Longitude ", round(dma_bbox[1], 1), "° to ", round(dma_bbox[3], 1), 
        "°, Latitude ", round(dma_bbox[2], 1), "° to ", round(dma_bbox[4], 1), "°")
message("Includes Alaska: ", dma_bbox[1] < -140, ", Hawaii: ", dma_bbox[2] < 20)

# Load the original spatial survey data which has lon_zip and lat_zip coordinates
# In anonymized mode, this file doesn't exist - skip DMA spatial assignment
if (HAS_COORDINATES && file.exists(here("data", "inter", "survey_visibility_processed_with_geo.rds"))) {
  survey_spatial <- readRDS(here("data", "inter", "survey_visibility_processed_with_geo.rds"))
} else {
  survey_spatial <- g  # Use the already-loaded data
}

# Check if coordinates exist
if (HAS_COORDINATES && "lon_zip" %in% names(survey_spatial) && "lat_zip" %in% names(survey_spatial)) {
  
  # Create spatial points from survey coordinates
  survey_points <- survey_spatial %>%
    filter(!is.na(lon_zip) & !is.na(lat_zip)) %>%
    select(response_id, lon_zip, lat_zip) %>%
    st_drop_geometry() %>%  # Remove existing geometry if any
    st_as_sf(coords = c("lon_zip", "lat_zip"), crs = 4326, remove = FALSE)
  
  message("Created spatial points for ", nrow(survey_points), " survey respondents with coordinates")
  
  # Ensure both datasets are in the same CRS (WGS84)
  dma <- st_transform(dma, 4326)
  survey_points <- st_transform(survey_points, 4326)
  
  # Perform spatial join to assign DMA to each survey respondent
  survey_with_dma <- st_join(survey_points, dma, join = st_within)
  
  message("Completed spatial join")
  message("Rows before duplicate handling: ", nrow(survey_with_dma))
  
  # Check for and handle duplicates
  duplicates <- survey_with_dma %>%
    st_drop_geometry() %>%
    group_by(response_id) %>%
    filter(n() > 1) %>%
    ungroup()
  
  if (nrow(duplicates) > 0) {
    message("Warning: Found ", nrow(duplicates), " duplicate DMA assignments")
    message("This can happen when respondents are on DMA boundaries")
    
    # For duplicates, keep the DMA with the smallest code (arbitrary but consistent)
    survey_with_dma_dedup <- survey_with_dma %>%
      st_drop_geometry() %>%
      group_by(response_id) %>%
      arrange(dma_code) %>%
      slice(1) %>%
      ungroup()
  } else {
    survey_with_dma_dedup <- survey_with_dma %>% st_drop_geometry()
  }
  
  message("Rows after duplicate handling: ", nrow(survey_with_dma_dedup))
  
  # Extract just the DMA information  
  dma_assignments <- survey_with_dma_dedup %>%
    select(response_id, dma_code, dma_name, market_size) %>%
    rename(
      dma = dma_code,
      dma_name = dma_name,
      dma_market_size = market_size
    )
  
  # Check coverage
  n_with_dma <- sum(!is.na(dma_assignments$dma))
  n_total <- nrow(dma_assignments)
  message("Successfully assigned DMA to ", n_with_dma, " out of ", n_total, " respondents with coordinates")
  message("DMA assignment coverage: ", round(n_with_dma/n_total * 100, 1), "%")
  
  # Show DMA distribution
  if (n_with_dma > 0) {
    dma_summary <- dma_assignments %>%
      filter(!is.na(dma)) %>%
      count(dma_market_size, sort = TRUE)
    message("DMA market size distribution:")
    for (i in 1:nrow(dma_summary)) {
      message("  ", dma_summary$dma_market_size[i], ": ", dma_summary$n[i], " respondents")
    }
  }
  
  # Cache DMA assignments for anonymized mode
  dma_cache_file <- here("data", "cache", "dma_assignments.csv")
  dir.create(dirname(dma_cache_file), recursive = TRUE, showWarnings = FALSE)
  write.csv(dma_assignments, dma_cache_file, row.names = FALSE)
  message("[OK] Cached DMA assignments to: ", dma_cache_file)
  
} else {
  # Anonymized mode: load cached DMA assignments (required)
  dma_cache_file <- here("data", "cache", "dma_assignments.csv")
  if (file.exists(dma_cache_file)) {
    message("Loading cached DMA assignments from: ", dma_cache_file)
    dma_assignments <- read.csv(dma_cache_file, stringsAsFactors = FALSE)
    n_with_dma <- sum(!is.na(dma_assignments$dma))
    message("[OK] Loaded ", nrow(dma_assignments), " DMA assignments (", n_with_dma, " with valid DMA)")
  } else {
    stop("DMA cache not found at: ", dma_cache_file, "\n",
         "The anonymized replication package requires pre-computed DMA assignments.\n",
         "This cache should be generated by running the pipeline in full mode first,\n",
         "then copying data/cache/dma_assignments.csv to the replication package.\n",
         "See README.md for instructions on creating the replication package.")
  }
}

# Merge DMA assignments back to main dataset
m11_with_dma <- left_join(m11, dma_assignments, by = "response_id")

# Use the dataset with DMA for subsequent joins
m12 <- left_join(m11_with_dma, gov, by = c("state" = "state_full"))

g <- st_drop_geometry(m12)

# Code variables----
message("=== CREATING DERIVED VARIABLES FOR ANALYSIS ===")

# Create swing state indicator
swing_states <- c("Georgia", "Arizona", "Michigan", "Wisconsin", "Pennsylvania", "Nevada", "North Carolina")
g$swing <- ifelse(g$state %in% swing_states, 1, 0)

# Sample variable
g$sample <- factor(g$sample)

# Education
g$edu_hsless <- ifelse(g$edu3 == "High School or less", 1, 0)
g$edu_ba <- ifelse(g$edu3 == "BA or higher", 1, 0)
g$edu_some <- ifelse(g$edu3 == "Some College", 1, 0)

# Broadband
g$bb100 <- droplevels(g$bb100)
g$bb100_0to400 <- ifelse(g$bb100 == "0-400", 1, 0)
g$bb100_400to600 <- ifelse(g$bb100 == "400-600", 1, 0)
g$bb100_600to800 <- ifelse(g$bb100 == "600-800", 1, 0)
g$bb100_over800 <- ifelse(g$bb100 == ">800", 1, 0)

# Standardize covariates with within-state variance (for county-level variables)
covar.std <- c(
  "urate", "force_ln", "gdp_ln", "income_pc", "college_acs", "poverty_acs",
  "housing_acs", "popd", "demshare", "foreign_acs"
)

for (covar in covar.std) {
  g <- std_within_state(covar, data = g)
}

# Standardize state-level variables with full-sample z-scores
# (within-state standardization not applicable for state-level vars)
g <- g %>%
  mutate(
    union_mem_z = (union_mem - mean(union_mem, na.rm = TRUE)) / sd(union_mem, na.rm = TRUE),
    eprice_z = (eprice - mean(eprice, na.rm = TRUE)) / sd(eprice, na.rm = TRUE)
  )

# Income label (for heterogeneity analysis)
g$income_bin_lab <- ifelse(g$income_bin == 1, "Above median", "Below median")

# Population density label (convert binary to label)
g$popd_abovemed <- ifelse(g$popd_abovemed, "Above median", "Below median")

g$credit_page_submit <- as.numeric(g$credit_page_submit)

# Credit all actors indicator
g$credit_all <- as.integer(with(g, credit_biden_bin + credit_cong_bin + credit_gov_bin + credit_state_bin + credit_com_bin + credit_market_bin == 6))

# Create "not at all responsible" binary indicators
g <- g %>%
  mutate(
    across(c(credit_biden, credit_cong, credit_gov, credit_state, credit_com, credit_market),
           ~ ifelse(.x == "Not at all responsible", 1, 0), .names = "{col}_none_bin"
    )
  )
g$credit_none <- as.integer(with(g, credit_biden_none_bin +
                             credit_cong_none_bin +
                             credit_gov_none_bin +
                             credit_state_none_bin +
                             credit_com_none_bin +
                             credit_market_none_bin == 6))

# Create "extremely responsible" binary indicators
g <- g %>%
  mutate(
    across(c(credit_biden, credit_cong, credit_gov, credit_state, credit_com, credit_market),
           ~ ifelse(.x == "Extremely responsible", 1, 0), .names = "{col}_extreme_bin"
    )
  )
g$credit_extreme <- as.integer(g$credit_biden_extreme_bin +
                             g$credit_cong_extreme_bin +
                             g$credit_gov_extreme_bin +
                             g$credit_state_extreme_bin +
                             g$credit_com_extreme_bin +
                             g$credit_market_extreme_bin == 6)

# Within-subject comparison variables
g$biden_vs_gov <- as.integer(g$credit_biden_bin - g$credit_gov_bin == 1)
g$biden_vs_state <- as.integer(g$credit_biden_bin - g$credit_state_bin == 1)

# Biden county indicator (based on 2020 vote share)
g$biden_county <- as.integer(g$demshare > 50)

# Calculate sample weights-----
message("=== CALCULATING WEIGHTS ===")

# Prepare survey data for weights calculation
g <- prepare_survey_for_weighting(g)

# Calculate weights for full sample
full_sample_weights <- calculate_survey_weights(g, verbose = TRUE)
g$wt_all <- full_sample_weights$weights
g$wt_all_trim <- full_sample_weights$weights_trimmed

# Get weights for subset where credit attribution question is available
g.credit <- g %>%
  filter(!is.na(credit_biden_bin))
credit_sample_weights <- calculate_survey_weights(g.credit, verbose = TRUE)
g.credit$wt_credit <- credit_sample_weights$weights
g.credit$wt_credit_trim <- credit_sample_weights$weights_trimmed
g.credit <- subset(g.credit, select = c(response_id, wt_credit, wt_credit_trim))
summary(g.credit)

g <- left_join(g, g.credit, by = "response_id")

# Get weights for subset where green project benefits question is available
g.green <- g %>%
  filter(!is.na(greenbenefit))
green_sample_weights <- calculate_survey_weights(g.green, verbose = TRUE)
g.green$wt_green <- green_sample_weights$weights
g.green$wt_green_trim <- green_sample_weights$weights_trimmed
g.green <- subset(g.green, select = c(response_id, wt_green, wt_green_trim))
summary(g.green)

g <- left_join(g, g.green, by = "response_id")

# Get weights for subset where green project type question is available
g.green_type <- g %>%
  filter(!is.na(proj_solar) | !is.na(proj_wind) | !is.na(proj_geo) | !is.na(proj_bat) | !is.na(proj_ev) | !is.na(proj_h))
green_type_sample_weights <- calculate_survey_weights(g.green_type, verbose = TRUE)
g.green_type$wt_green_type <- green_type_sample_weights$weights
g.green_type$wt_green_type_trim <- green_type_sample_weights$weights_trimmed
g.green_type <- subset(g.green_type, select = c(response_id, wt_green_type, wt_green_type_trim))
summary(g.green_type)

g <- left_join(g, g.green_type, by = "response_id")

# Save output----
message("Final dataset has ", nrow(g), " rows")
message("Final dataset has ", ncol(g), " columns")

saveRDS(g, here("data", "output", "visibility_analysis.rds"))
message("[OK] Saved final dataset to data/output/visibility_analysis.rds")